import os
from datetime import datetime
import time
from tqdm import tqdm
import pandas as pd
import spacy
import re
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sentence_transformers import SentenceTransformer
# from umap import UMAP
from cuml import UMAP
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# from hdbscan import HDBSCAN
from cuml.cluster.hdbscan import HDBSCAN
import plotly.io as pio
pio.renderers.default = "notebook+vscode+jupyterlab"
sns.set_theme(style="darkgrid")
# %config InlineBackend.figure_format = "retina"
# Available spaCy English pipelines, smallest to largest:
# en_core_web_sm
# en_core_web_md
# en_core_web_lg
# en_core_web_trf
# Imported explicitly: attribute access through `spacy.lang.en.stop_words`
# only works when that submodule happens to have been imported already.
from spacy.lang.en.stop_words import STOP_WORDS

# Every component named in `exclude` is left out when the pipeline is loaded.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
)
# Plain list copy of spaCy's English stop-word set.
spacy_stopwords = list(STOP_WORDS)

# Dynamic Topic Modelling of r/politics subreddit
This project allows:
- To clean gathered data and explore it.
- To extract the main topics from gathered data and visualise them.
- To visualise dynamic changes of topics over time.
To extract topics, we use BERTopic library, which performs topic modeling using clustering of vector representations of documents. The main differences between BERTopic and other topic models:
- High speed due to reducing the dimensionality of vector representations.
- Modular structure of the model pipeline: the stages of vectorization, dimensionality reduction and clustering are separated from each other, which allows you to easily and quickly experiment with different combinations of algorithm settings.
- The model pipeline consists of SOTA tools: SBERT, UMAP, HDBSCAN. Combined, this allows you to get the best results compared to other models.
This project can be easily adjusted to other sources of information, which allows you to conduct different experiments.
Install libraries
We use the cuML implementations of HDBSCAN and UMAP to speed up dimensionality reduction and clustering of the data on the GPU.
Load and clean data from csvs
BERTopic uses SBERT. The model learns better if it receives more information from the text. Therefore, data preprocessing is minimal.
Function to clean data from HTML elements using regular expressions
# URL matcher kept from the original implementation. Its `$-_` range also
# accepts `)`, which is why the parenthesised form has to be tried first.
_URL_BODY = (
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_PAREN_URL_RE = re.compile(r"\(" + _URL_BODY + r"\)")
_URL_RE = re.compile(_URL_BODY)
# Markup / Reddit artefacts, replaced in a single pass:
#   newlines, <...> tags, {...} blocks, the "u/" user prefix, "(…emote…)"
#   groups, "[gif]", the standalone "/s" sarcasm tag and underscores.
# "u/" and "/s" are anchored to token boundaries so that ordinary text such as
# "you/me" or "r/science" is left intact; the emote group may not cross other
# parentheses, so unrelated "(...)" spans on the same line survive.
_MARKUP_RE = re.compile(
    r"\n|\r|<.*?>|\{.*?\}|\bu/|\([^()\n]*emote[^()\n]*\)|\[gif\]|(?<![\w/])/s\b|_"
)
# Same character set as the original class, with the accidental `,-:` range
# (comma .. colon, which also covers "-", ".", "/" and digits) spelled out.
_DISALLOWED_RE = re.compile(r"[^\w'’“”%!?.,\-/:*()><]")
_LONE_PAREN_RE = re.compile(r"\s\(\s")
_WHITESPACE_RE = re.compile(r"\s+")


def regex_preprocessing(text):
    """Strip URLs, HTML/Reddit markup and stray symbols from one comment.

    Args:
        text: Raw comment text. Non-string values (for example the NaN that
            pandas uses for an empty cell) are returned unchanged so that a
            later ``dropna()`` can still remove them.

    Returns:
        The cleaned text with runs of whitespace collapsed to single spaces
        and leading/trailing whitespace removed.
    """
    if not isinstance(text, str):
        return text
    # Remove URLs: "(http://...)" first, otherwise the bare-URL pattern
    # consumes the closing parenthesis and leaves a dangling "(" behind.
    text = _PAREN_URL_RE.sub(" ", text)
    text = _URL_RE.sub(" ", text)
    # Remove special symbols
    text = _MARKUP_RE.sub(" ", text)
    text = _DISALLOWED_RE.sub(" ", text)
    # Remove unnecessary brackets
    text = _LONE_PAREN_RE.sub(" ", text)
    # Delete unnecessary whitespaces
    return _WHITESPACE_RE.sub(" ", text).strip()
# Function to convert data to a dataframe, drop duplicates in the dataframe and to apply ‘regex_preprocessing’ function to data
def data_preprocessing(file_name):
    """Load one CSV file, drop duplicated rows and clean its comments.

    Args:
        file_name: Path of the CSV file, passed straight to ``pd.read_csv``.
            The file must contain a ``comments`` column.

    Returns:
        A new DataFrame without duplicated rows whose ``comments`` column has
        been passed through ``regex_preprocessing``.
    """
    data = pd.read_csv(file_name)
    # keep=False discards every copy of a duplicated row, not only the repeats.
    # .copy() makes the result an independent frame, so the column assignment
    # below is not a chained assignment on a filtered selection (which pandas
    # reports with SettingWithCopyWarning when copy-on-write is off).
    data_cleaned = data.drop_duplicates(keep=False).copy()
    data_cleaned["comments"] = data_cleaned["comments"].apply(regex_preprocessing)
    return data_cleaned
# Function to create a dataframe with a cleaned data
This function consists of several steps:
- First, it collects the names of the CSV files in the chosen folder.
- Second, it applies the ‘data_preprocessing’ function to each CSV file to create dataframes with cleaned data.
- Finally, it concatenates them into one combined dataframe with cleaned data.
def process_data(directory):
    """Build one cleaned DataFrame from all CSV files in ``directory``.

    Args:
        directory: Folder that holds the CSV exports. Each file is loaded
            with ``data_preprocessing``.

    Returns:
        The concatenation of all cleaned files without the ``time`` column,
        without duplicated rows and without rows containing missing values,
        indexed 0..n-1.

    Raises:
        ValueError: If the folder contains no ``.csv`` file.
    """
    # Only regular *.csv files are read, so sub-folders or stray files in the
    # directory do not end up in pd.read_csv. Sorting the full paths keeps
    # the processing order deterministic.
    file_names = sorted(
        path
        for path in (os.path.join(directory, entry) for entry in os.listdir(directory))
        if os.path.isfile(path) and path.lower().endswith(".csv")
    )
    if not file_names:
        raise ValueError(f"No CSV files found in {directory!r}")

    dataframes = [data_preprocessing(name) for name in file_names]
    cleaned_df = (
        pd.concat(dataframes)
        .drop(columns="time")
        .drop_duplicates()
        .dropna()
        # Re-index last so the result has no gaps left by the dropped rows.
        .reset_index(drop=True)
    )
    return cleaned_df
# Apply data processing functions to gathered data
For this experiment, we load CSV files with data marked as ‘hot’ by Reddit's algorithms.
# Folder with the CSV exports used for this experiment.
directory = "original_data/hot"
combined_df = process_data(directory)
# Number of comments left after cleaning (no need to build a list to count).
len(combined_df["comments"])
# Output of the recorded run: 264230
Convert the dataframe columns to lists for further work
# Both lists come from the same dataframe, so comments[i] and timestamps[i]
# belong to the same row.
comments = combined_df["comments"].to_list()
timestamps = combined_df["date"].to_list()
# Create embeddings from cleaned data
The gte-small model was chosen using the Hugging Face benchmark. It is lightweight and works well with Reddit data.
# Pre-calculate embeddings
# Candidate models:
# dunzhang/stella_en_400M_v5
# thenlper/gte-small
embedding_model = SentenceTransformer(
    model_name_or_path="thenlper/gte-small",
    cache_folder="transformers_cache",
)
# Computed once here and reused by the plotting helpers and by BERTopic.
embeddings = embedding_model.encode(comments, show_progress_bar=True)
# Plot data distribution
We use UMAP to reduce the dimensionality of data, which makes it easier to cluster it using HDBSCAN.
def plot_umap(embeddings, values):
    """Plot a 2-D UMAP projection of ``embeddings`` for each ``n_neighbors``.

    Args:
        embeddings: Document embeddings passed to ``UMAP.fit_transform``.
        values: Iterable of ``n_neighbors`` settings; one panel is drawn for
            each value.

    The panels are laid out five per row and the grid grows with the number
    of values, so none of them is silently dropped (ten values give the
    original 2x5 figure).
    """
    neighbors_list = list(values)
    n_cols = 5
    # Ceiling division; keep at least one row so an empty input still plots.
    n_rows = max(1, -(-len(neighbors_list) // n_cols))
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(27, 5 * n_rows),
        sharex=True,
        sharey=True,
        squeeze=False,
    )
    axes = axes.flatten()
    for ax, neighbors in tqdm(
        zip(axes, neighbors_list), total=len(neighbors_list)
    ):
        umap_model = UMAP(
            n_neighbors=neighbors, n_components=2, min_dist=0.0, metric="cosine"
        )
        # Apply UMAP to our data
        umap_result = umap_model.fit_transform(embeddings)
        # Visualise the results
        ax.scatter(
            umap_result[:, 0], umap_result[:, 1], alpha=0.15, c="orangered", s=0.1
        )
        ax.set_title(f"UMAP, n_neighbors = {neighbors}")
        ax.set_xlabel("component 1")
        ax.set_ylabel("component 2")
    # The axes are shared, so limits set on the current axes apply to all panels.
    lim = 7
    plt.ylim(-lim, lim)
    plt.xlim(-lim, lim)
    plt.tight_layout()
    plt.show()
def plot_hdbscan(embeddings, umap_values, hdbscan_values):
    """Plot HDBSCAN clusterings of 2-D UMAP projections of ``embeddings``.

    Args:
        embeddings: Document embeddings passed to ``UMAP.fit_transform``.
        umap_values: Iterable of UMAP ``n_neighbors`` settings; one figure is
            shown per value.
        hdbscan_values: Iterable of HDBSCAN ``min_cluster_size`` settings;
            one panel per value in every figure.

    The number of panels follows ``len(hdbscan_values)`` (four values give
    the original 1x4 figure), so no setting is silently dropped.
    """
    sizes = list(hdbscan_values)
    n_panels = max(1, len(sizes))
    umap_columns = ["UMAP1", "UMAP2"]
    for n in umap_values:
        # Apply UMAP to our data
        umap_model = UMAP(n_neighbors=n, n_components=2, min_dist=0.0, metric="cosine")
        umap_result = umap_model.fit_transform(embeddings)
        # HDBSCAN
        fig, axes = plt.subplots(
            1,
            n_panels,
            figsize=(5 * n_panels, 5),
            sharex=True,
            sharey=True,
            squeeze=False,
        )
        axes = axes.flatten()
        for ax, size in tqdm(zip(axes, sizes), total=len(sizes)):
            # Cluster data with HDBSCAN
            hdbscan_model = HDBSCAN(
                min_cluster_size=size, metric="euclidean", prediction_data=True
            )
            hdbscan_labels = hdbscan_model.fit_predict(umap_result)
            # Create a dataframe with results of UMAP and HDBSCAN
            df = pd.DataFrame(umap_result, columns=umap_columns)
            df["Cluster"] = hdbscan_labels
            # scatterplot for results
            sns.scatterplot(
                x="UMAP1",
                y="UMAP2",
                hue="Cluster",
                data=df,
                palette="tab10",
                legend=None,
                linewidth=0,
                s=0.5,
                ax=ax,
            ).set_title(f"n_neighbors={n}, min_cluster_size={size}")
            ax.set_xlabel("component 1")
            ax.set_ylabel("component 2")
        # The axes are shared, so limits set on the current axes apply to all
        # panels of this figure.
        lim = 7
        plt.ylim(-lim, lim)
        plt.xlim(-lim, lim)
        plt.tight_layout()
        plt.show()
# We plot a range of values to see how a structure of data changes: from a
# more local structure to a global one. For topic modelling, it is better to
# focus on a more local view of data for a more precise topic clustering.
# n_neighbors from 10 to 55 in steps of 5 (ten panels).
plot_umap(embeddings, np.arange(10, 56, 5))
# Output of the recorded run: 10it [01:02, 6.29s/it]

We can see the sizes of the clusters created with different parameter combinations. Blue dots are data points marked as noise. Generally, the higher the UMAP and HDBSCAN parameters, the larger the clusters.
# One figure per n_neighbors value, one panel per min_cluster_size value.
plot_hdbscan(embeddings, [15, 20, 25], [15, 35, 50, 75])
# Output of the recorded run: 4it [01:45, 26.42s/it]

4it [01:52, 28.13s/it]

4it [01:42, 25.57s/it]

Extract topics using BERTopic
In this work, we use MaximalMarginalRelevance topic representation model, which changes the order of words in topics to remove semantic repetitions and create a sequence of the most significant words.
Function for Topic Modelling Pipeline
This function encapsulates all previous steps in one pipeline: creation of embeddings, dimensionality reduction and clustering of data. Two new steps are added.
We use CountVectorizer from Scikit-learn to:
- remove very rare and frequent words from the final topic representations.
- create n-grams, up to 3 words in total.
- remove stopwords from topic representations.
We use representation models, such as KeyBERTInspired or MaximalMarginalRelevance, to further fine-tune topic representations.
- MaximalMarginalRelevance model changes the order of words in topics to remove semantic repetitions and create a sequence of the most significant words.
- KeyBERTInspired creates topic representations with words most similar to corresponding documents.
def topic_modelling(n_neighbors, min_cluster_size, representation_name):
    """Fit a BERTopic model and build its tables and visualisations.

    The function reads the module-level ``embedding_model``, ``comments``,
    ``embeddings`` and ``timestamps``.

    Args:
        n_neighbors: ``n_neighbors`` of the 5-component UMAP reduction.
        min_cluster_size: ``min_cluster_size`` of the HDBSCAN clustering.
        representation_name: ``"KeyBERTInspired"`` selects KeyBERTInspired;
            any other value selects MaximalMarginalRelevance.

    Returns:
        A tuple ``(topics, probs, topic_representation, topics_map,
        topics_hierarchy, topics_over_time, plot)``.
    """
    # Topic representation model
    representation_model = (
        KeyBERTInspired()
        if representation_name == "KeyBERTInspired"
        else MaximalMarginalRelevance()
    )

    # BERTopic model init
    topic_model = BERTopic(
        embedding_model=embedding_model,
        # Dimensionality reduction
        umap_model=UMAP(
            n_neighbors=n_neighbors, n_components=5, min_dist=0.0, metric="cosine"
        ),
        # Clustering
        hdbscan_model=HDBSCAN(
            min_cluster_size=min_cluster_size,
            metric="euclidean",
            prediction_data=True,
        ),
        # Remove noise from created topics
        vectorizer_model=CountVectorizer(
            stop_words="english", min_df=0.03, max_df=0.99, ngram_range=(1, 3)
        ),
        representation_model=representation_model,
        verbose=True,
    )

    # Fit the model on the documents with their pre-computed embeddings
    topics, probs = topic_model.fit_transform(comments, embeddings)

    # Topics dataframe
    topic_representation = topic_model.get_topic_info()

    # 2D visualisation of topics
    topics_map = topic_model.visualize_topics()

    # Hierarchical visualisation of topics
    topics_hierarchy = topic_model.visualize_hierarchy(
        hierarchical_topics=topic_model.hierarchical_topics(comments)
    )

    # Topics over time
    topics_over_time = topic_model.topics_over_time(
        comments,
        timestamps,
        datetime_format="%Y_%m_%d",
        global_tuning=False,
        evolution_tuning=False,
    )

    # Plot topics over time
    plot = topic_model.visualize_topics_over_time(
        topics_over_time, top_n_topics=15, height=700, width=1200
    )

    return (
        topics,
        probs,
        topic_representation,
        topics_map,
        topics_hierarchy,
        topics_over_time,
        plot,
    )
# Experiments
This part is purely experimental and required a lot of time to tune the hyperparameters of the model to get the best output. This is one of the main problems of topic modelling: there is no metric to help us choose the best hyperparameters. Also, the best modelling result may be subjective. That is why we run a series of experiments to obtain several results.
Generally, hyperparameters should be chosen taking into account several goals:
- To preserve the local structure of the data after reducing the dimensionality of the data with UMAP.
- To reduce the amount of noise in clusters and create an adequate number of topics with HDBSCAN.
- To create a list of understandable topics at the output.
After several rounds of experiments, the best values for UMAP n_neighbors parameter are 25 and 30. The best values for HDBSCAN min_cluster_size are 50, 75, 100. The best topic representation model is KeyBERTInspired.
Because of the subjectivity of final results, below you can see results of model runs with a set of chosen parameters.
25, 50
# Run the pipeline with n_neighbors=25, min_cluster_size=50 and the
# KeyBERTInspired representation; the returned tuple is unpacked into globals.
(
topics,
probs,
topic_representation,
topics_map,
topics_hierarchy,
topics_over_time,
plot,
) = topic_modelling(25, 50, "KeyBERTInspired")
# Set the Plotly renderer before the figures are displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:26:02,308 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:26:19,847 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:26:19,850 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:26:09.712922] Transform can only be run with brute force. Using brute force.
2024-12-16 05:27:05,246 - BERTopic - Cluster - Completed ✓
2024-12-16 05:27:05,276 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:27:28,829 - BERTopic - Representation - Completed ✓
100%|██████████| 180/180 [00:25<00:00, 6.99it/s]
16it [01:38, 6.17s/it]
topics_maptopics_hierarchytopic_representation["Name"].to_list()['-1_voters_election_campaign_vote',
'0_bernie_bernie sanders_democrat_democrats',
'1_russia_putin_russians_trump putin',
'2_vote vote_vote_vote vote vote_voting',
'3_voted harris_voting harris_vote harris_harris trump',
'4_trump_trump trump_trump supporters_voted trump',
'5_candidate_election_democratic_trump',
'6_mass deportations_mass deportation_illegal immigrant_illegal immigrants',
'7_hitler_fascist_trump hitler_nazi',
'8_pro israel_gaza_palestinians_support israel',
'9_tariffs_trump tariffs_tariff_inflation',
'10_rfk jr_rfk_jr_kennedy',
'11_christianity_christians_christian_religion',
'12_biden_biden biden_biden called_joe biden',
'13_fox news_fox_news outlets_mainstream media',
'14_kamala going_vote kamala_kamala_kamala lost',
'15_states blue_texas_states_blue states',
'16_thank_thanks_welcome_appreciate',
'17_won popular vote_didn vote_voted_got elected',
'18_abortion_abortions_ban abortion_abortion rights',
'19_trans people_transgender people_trans women_trans rights',
'20_national guard_military_guard_army',
'21_america america_america_americans_america fucked',
'22_elon musk_musk_musk trump_trump musk',
'23_vance_vance said_vance doesn_trump vance',
'24_gonna lose_going lose_ll stop_lose',
'25_rapist_rape_raped_raping',
'26_joe rogan_rogan_rogan just_rogan podcast',
'27_women voted_women vote_vote women_women voters',
'28_supreme court_supreme court justices_justices_supreme court ruling',
'29_matt gaetz_gaetz_matt_gabbard',
'30_women_females_women women_ladies',
'31_project 2025_project 2025 plans_project 2025 plan_read project 2025',
'32_democracy isn_democracy_vote democracy_democracy doesn',
'33_genz_gen_gen men_millennials',
'34_voter fraud_election fraud_rigged election_election rigged',
'35_elon_like elon_elon just_trump elon',
'36_epstein_trump epstein_jeffrey epstein_michael',
'37_education_educated_schooling_higher education',
'38_iowa poll_iowa_selzer poll_polls',
'39_leopards eating faces_leopards ate face_leopards_leopards eating',
'40_yes did_yeah did_did yeah_sure did',
'41_obamacare_affordable care act_pre existing conditions_health insurance',
'42_incarcerated_going jail_prison_jailed',
'43_maga_trump maga_maga just_maga going',
'44_twitter_twitter just_twitter don_bought twitter',
'45_guns_firearms_gun_armed',
'46_eggs_egg_price eggs_chickens',
'47_stupidity_stupid_people stupid_stupid people',
'48_georgia_ga_ga nc_atlanta',
'49_white person_white people_people white_racist white',
'50_conservatives_conservative_conservatives think_like conservative',
'51_puerto ricans_puerto rico garbage_puerto rico_puerto rican',
'52_eating cats dogs_eating dogs_dogs cats_eating cats',
'53_merrick garland_garland_merrick_partisan',
'54_canada_canadian_canada mexico_toronto',
'55_tim walz_walz_tim_timothy',
'56_recess appointments_recess_appointments_appointment',
'57_headline_read headline_headlines like_article isn',
'58_reagan_reagan trump_ronald reagan_regan',
'59_echo chamber_echo chambers_echo chamber reddit_reddit echo chamber',
'60_liz cheney_like liz cheney_campaigning liz cheney_cheney',
'61_billionaires_billionaire_wealthy_billionaire class',
'62_betting markets_betting_bets_gambling',
'63_jill stein_stein_jill_green party',
'64_states rights_state rights_state right_states right',
'65_stock_money trump_puts_stock market',
'66_war hawks_military_trump said_war hawk',
'67_shocked_shocked shocked_shocking_shocked tell',
'68_porn_pornography_ban porn_obscene',
'69_tucker carlson_tucker_carlson_fox',
'70_fbi_background checks_security clearances_security clearance',
'71_orange_orange man_orange man bad_want orange',
'72_joke_joking_just joke_jokes',
'73_years years_years probably_years_probably years',
'74_2016 wasn_2016_2016 2016_happened 2016',
'75_garbage truck_dump truck_truck_garbage',
'76_pardon trump_pardoning_pardon_pardons',
'77_fluoride_water_chemicals_drink',
'78_normal gay_gay_gay guy_gay gay',
'79_mom_parent_mother_mum',
'80_taxes_paying taxes_tax_taxpayer',
'81_signs trump_trump sign_trump signs_harris signs',
'82_clown_clowns_fucking clown_clown car',
'83_climate change_global warming_climate_warming',
'84_mom voted_voting trump_parents voted_vote trump',
'85_idiocracy_idiocy_intelligence_idiots',
'86_wins_win_win win_winning',
'87_garbage_garbage garbage_called garbage_people garbage',
'88_dictator day_dictator_dictator life_going dictator',
'89_celebrity endorsements_endorsements_endorsement_endorsed',
'90_newsom_gavin newsom_newsome_republican',
'91_economist_economists_economics_better economy',
'92_brain worm_brain worms_worm_worms',
'93_crosses_cross_crusade_symbols',
'94_housing crisis_housing_housing market_buy house',
'95_gingrich_newt_paul ryan_politics',
'96_fema_hurricanes_hurricane_trump',
'97_red mirage_mirage_red wave_red',
'98_john bolton_bolton_worse trump_trump bad',
'99_oligarchy_oligarchs_oligarch_oligarchic',
'100_vote blue_voting blue_blue vote_voted blue',
'101_dementia_dementia don_demented_cognitive decline',
'102_fraud_fraudulent_frauds_fraud claims',
'103_robinson_mark robinson_sack_sack shit',
'104_filibuster_house senate_senate_senate republicans',
'105_reddit_reddits_reddit just_know reddit',
'106_tiktok_youtube tiktok_tik tok_tik',
'107_pence_mike pence_trump pence_republican',
'108_immigration_immigrate_visas_visa',
'109_lottery_fraud_fraudulently_scam',
'110_stephen miller_miller_stephen_steven',
'111_magats_magat_fuckin_post',
'112_sad_way sad_really sad_just sad',
'113_blowing microphone_mic_microphone_microphones',
'114_laws_law_laws make_law does',
'115_vote day_voting day_day election day_election day',
'116_woke_woke stuff_anti woke_wokeness',
'117_bots_bot_spamming_trolls',
'118_terrifying_frightening_scary_fucking scary',
'119_let burn_burn_burning_burned',
'120_pelosi_nancy pelosi_wasn wrong_wrong',
'121_jim jordan_jordan_jim_gym',
'122_people hate_hurting people_hate people_people going suffer',
'123_ai_artificial_intelligence_human',
'124_voting age_voters young_younger voters_young voters',
'125_owning libs_owned libs_libs_just libs',
'126_bezos_like bezos_jeff bezos_amazon',
'127_percent_100 percent_percentages_15',
'128_felon president_convicted felon_convicted felons_felon convicted',
'129_golf_play golf_golfing_golf courses',
'130_missile defense_missile_missiles_herschel walker',
'131_businessman_failed businessman_richest man_business',
'132_leak_leaking_leaked_leaks',
'133_civil war_american civil_confederate_like civil',
'134_mitch mcconnell_mcconnell_mitch_senate',
'135_palpatine_star wars_empire_wars',
'136_taliban_afghanistan_afghan_osama',
'137_little late_late_far late_better late',
'138_lie_lie lie_lied_lying',
'139_working class_working class people_working class american_working class americans',
'140_jail trump_trump prison_trump jail_time trump',
'141_newsweek_journalism_read_trump article',
'142_cheating_cheating just_didn cheat_cheated',
'143_shapiro_ben shapiro_josh shapiro_whitmer shapiro',
'144_brexit_britain_uk_brits',
'145_susan_collins_amy_shocked',
'146_plz_spread word__',
'147_alex jones_jones_infowars_alex',
'148_social security_medicare social security_social security medicare_medicare social',
'149_lindsey graham_graham_lindsey_lindsay graham',
'150_elmo_twitter trump_half things_twitter',
'151_swamp_drain swamp_draining_drained',
'152_monarchy_nobility_aristocracy_prince',
'153_cheated wife_cheated wives_wife_wives',
'154_government efficiency_efficiency_efficient_dept',
'155_tells like_telling like_like told_tell like',
'156_john oliver_oliver_john_mainstream media',
'157_onion_onions_satire_shit says',
'158_lawsuits_lawsuit_sued_lawyer',
'159_domestic terrorists_domestic terrorism_domestic terrorist_terrorists',
'160_drunk_sober_drinking_alcohol',
'161_incels_incel_cucks_misogynists',
'162_insane_fucking insane_insane just_insane like',
'163_electoral college_electoral college popular_vote electoral_electoral',
'164_video_videos_video clip_youtube',
'165_student loan forgiveness_loan forgiveness_student loans_student loan',
'166_steve bannon_bannon_steve_giuliani',
'167_wants win_anymore won_going win_like won',
'168_corruption_corrupt_corrupt people_corrupt politicians',
'169_black voters_black vote_black people_blacks',
'170_metric_imperial_measure_like 15',
'171_citizens united_citizen united_citizens_rights citizens',
'172_couch_couches_room_bed',
'173_arnold palmer_palmer_arnold_dad did',
'174_won swing states_states won_chance winning_win election',
'175_podcasters_podcast_podcasts_think democrats',
'176_copium_hopium_smoking_share',
'177_hulk hogan_hogan_hulk_wwe',
'178_governor_governor state_governors_state',
'179_economy democrats_republicans_fucked republican_bad economy',
'180_leon_awww_yeah_hero']
25, 75
# Run the pipeline with n_neighbors=25, min_cluster_size=75 and the
# KeyBERTInspired representation; the results overwrite the previous run's globals.
(
topics,
probs,
topic_representation,
topics_map,
topics_hierarchy,
topics_over_time,
plot,
) = topic_modelling(25, 75, "KeyBERTInspired")
# Set the Plotly renderer before the figures are displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:29:39,747 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:29:56,072 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:29:56,075 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:29:46.239227] Transform can only be run with brute force. Using brute force.
2024-12-16 05:30:43,610 - BERTopic - Cluster - Completed ✓
2024-12-16 05:30:43,639 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:31:03,527 - BERTopic - Representation - Completed ✓
100%|██████████| 121/121 [00:21<00:00, 5.65it/s]
16it [01:30, 5.68s/it]
topics_maptopics_hierarchytopic_representation["Name"].to_list()['-1_democrats_trump_election_republicans',
'0_thank_thanks_answer_years',
'1_bernie_bernie sanders_democrats_republicans',
'2_inflation_economics_tariffs_gas prices',
'3_vote_vote vote_voting_vote vote vote',
'4_elon musk_elon_like elon_musk',
'5_voted harris_voting harris_vote harris_harris did',
'6_hillary_candidate_election_campaign',
'7_russia_russians_putin_like russia',
'8_women_think women_women vote_hate women',
'9_hitler_like hitler_nazi_fascist',
'10_gaza_israel_palestinians_palestine',
'11_rfk jr_rfk_kennedy_anti vaxxer',
'12_christianity_christians_church_religion',
'13_going lose_loses_lose_hopefully',
'14_biden_biden biden_saying biden_blame biden',
'15_trump voters_voting trump_trump supporters_trump supporter',
'16_fox news_fox_fake news_msnbc',
'17_kamala going_kamala_vote kamala_kamala campaign',
'18_states blue_blue states_texas_states red',
'19_abortion_ban abortion_abortion ban_abortions',
'20_won popular vote_win popular vote_got elected_popular vote',
'21_military_use military_national guard_army',
'22_trans people_transgender_transphobic_transgender people',
'23_illegal immigrant_illegal immigrants_deportations_deportation',
'24_america_america america_america really_americans',
'25_vance_vance vance_vance going_vance said',
'26_trump rapist_rapist_pedophile rapist_rapist president',
'27_repeal aca_obamacare_affordable care act_health insurance',
'28_joe rogan_rogan_like joe rogan_rogan just',
'29_matt gaetz_gaetz_gaetz just_gaetz ag',
'30_supreme court_court supreme court_justices_court supreme',
'31_voter fraud_election fraud_fraud election_rigged election',
'32_project 2025 said_project 2025 doesn_project 2025 just_project 2025 people',
'33_democracy_democracy democracy_democracy isn_vote democracy',
'34_iowa poll_iowa_wins iowa_iowan',
'35_education_educate_public education_schooling',
'36_millennials_younger generation_generation_gen gen',
'37_gun ownership_guns_buy gun_firearm',
'38_going prison_going jail_prison_sentenced',
'39_epstein_epsteins_epstein trump_jeffrey epstein',
'40_maga_maga people_maga maga_maga doing',
'41_recess appointments_recess_appointments_appointment trump',
'42_eggs_egg_eggs just_eggs going',
'43_conservatives_conservatives like_conservative_conservatives think',
'44_georgia north carolina_nc ga_ga nc_georgia',
'45_leopards face_leopards ate face_leopards eating faces_leopards eat faces',
'46_whites_white_white person_non white',
'47_puerto rico joke_puerto ricans_puerto rican_puerto rico comments',
'48_merrick garland_biden garland_garland_merrick',
'49_article isn_article actually_misleading headline_headline says',
'50_canadians_canada_canadian_moving canada',
'51_twitter_twitter used_deleted twitter_using twitter',
'52_reagan_ronald reagan_reagan trump_like reagan',
'53_cooked_cooked just_cook_let eat',
'54_betting markets_betting market_like betting_betting',
'55_tim walz_walz did_walz actually_walz said',
'56_jill stein_jill stein voters_vote stein_voted stein',
'57_porn_pornographic_pornography_porn site',
'58_latinos_latino_latino people_latinos trump',
'59_war hawks_trump said_military_guns',
'60_dad_parent_mom_parents',
'61_clown_clowns_clown circus_fucking clown',
'62_tucker carlson_tucker_like tucker_carlson said',
'63_pardon_pardoning_going pardon_pardon trump',
'64_garbage truck_drive garbage truck_driving garbage truck_driving garbage',
'65_buy stock_stock_pump dump_money trump',
'66_fbi background checks_fbi background check_fbi background_fbi',
'67_newsom_think newsom_gavin newsom_newsome',
'68_states rights_state rights_rights state_state right',
'69_orange_orange man_orange man bad_orange shit',
'70_red mirage_mirage_red_like red',
'71_fluoride water_fluoride_remove fluoride_water',
'72_tulsi gabbard_gabbard russian_intelligence tulsi_gabbard',
'73_idiocracy_idiocy_intelligence_idiots think',
'74_normal gays_normal gay_gay_normal gay guys',
'75_climate change going_climate change_global warming_climate',
'76_signs trump_trump sign_trump signs_signs harris',
'77_immigration_immigrate_permanent residency_visas',
'78_liz cheney_like liz cheney_campaigning liz cheney_cheneys',
'79_brain worm_brain worms_brainworm_worm',
'80_crosses_cross_jerusalem_crusaders',
'81_economist_economists_economics_better economy',
'82_liberal echo chamber_echo chambers_politics reddit_conservative subreddit',
'83_laws_law_law law_laws really',
'84_celebrity endorsement_celebrity endorsements_endorsements_endorsement',
'85_dementia_alzheimer_dementia don_cognitive decline',
'86_reddit_reddit reddit_reddit does_reddits',
'87_trump_trump supporter_trump supporters_voted trump',
'88_newt gingrich_gingrich_newt_politics',
'89_bezos_like bezos_jeff bezos_amazon',
'90_fuck em_fuck people_fuck fuck_fuck idiots',
'91_jail trump_prison trump_trump criminal_trump prison',
'92_blowing microphone_blow microphone_microphone say_microphone',
'93_wins trump_trump winning_win trump_trump wins',
'94_john bolton_bolton_john_patriot',
'95_oligarchy_oligarchy usa_global oligarchy_oligarchic',
'96_vote blue_voting blue_vote blue voting_blue vote',
'97_pence_mike pence_trump pence_republican',
'98_ai_artificial intelligence_train ai_ai generated',
'99_civil war_civil war like_war civil_start civil war',
'100_stephen miller_stephen miller trump_steven miller_miller',
'101_rid filibuster_filibuster_does senate_senate',
'102_tiktoks_tiktok_youtube tiktok_news tiktok',
'103_fema_hurricanes_hurricane_trump',
'104_magats_magat_magats won_maggots',
'105_woke_woke stuff_wokeism_anti woke',
'106_leopardsatemyface_leopards face_leopard_leopards',
'107_garbage_garbage yeah_say garbage_garbage garbage',
'108_bots_bot_trolls_shills',
'109_combat roles_women combat roles_women combat_combat',
'110_black voters_voters black_black vote_vote black',
'111_jim jordan_jordan_jim_like jim',
'112_mark robinson_robinson_picked_lose',
'113_voting age_voters young_younger voters_young voters',
'114_police_cops_policing_police officers',
'115_declared victory_didn won_surprised hasn_victory said',
'116_echo chamber reddit_reddit echo chamber_echo chamber_echo chambers',
'117_royalty_nobility_aristocracy_titles nobility',
'118_wins_win win_win_winning',
'119_lottery_fraud_committing fraud_lotteries',
'120_vote felon_voting felon_voted felon_convicted felon president',
'121_let burn_burn_burn let_burning']
25, 100
# Run the pipeline with n_neighbors=25, min_cluster_size=100 and the
# KeyBERTInspired representation; the results overwrite the previous run's globals.
(
topics,
probs,
topic_representation,
topics_map,
topics_hierarchy,
topics_over_time,
plot,
) = topic_modelling(25, 100, "KeyBERTInspired")
# Set the Plotly renderer before the figures are displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:34:33,560 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:34:49,993 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:34:49,995 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:34:40.136982] Transform can only be run with brute force. Using brute force.
2024-12-16 05:35:34,871 - BERTopic - Cluster - Completed ✓
2024-12-16 05:35:34,903 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:35:54,443 - BERTopic - Representation - Completed ✓
100%|██████████| 98/98 [00:26<00:00, 3.72it/s]
16it [01:28, 5.51s/it]
topics_maptopics_hierarchytopic_representation["Name"].to_list()['-1_election_voting_voters_biden',
'0_stupid_happen_dumb_lol',
'1_democrats_blame democrats_democrat_democrats need',
'2_misogyny_women_women vote_women voted',
'3_tariffs_economy_tariff_economics',
'4_russia_like russia_russians_putin',
'5_orange vest_orange_man_face',
'6_musk_elon musk_musk trump_trump musk',
'7_voted harris_harris did_voting harris_vote harris',
'8_vote_vote vote_vote vote vote_just vote',
'9_trump voters_voting trump_trump supporters_people voted trump',
'10_hillary_woman_lady_did',
'11_nazi_nazis_fascist_fascists',
'12_gaza_gazans_palestinians_palestine',
'13_christians_christianity_christian_religious',
'14_rfk jr_rfk_kennedy_jr',
'15_saying biden_trump biden_biden_biden biden',
'16_kamala_kamala lost_kamala going_vote kamala',
'17_media outlets_fox news_media_mainstream media',
'18_illegal immigrant_illegal immigrants_illegal immigration_illegals',
'19_trans women_transgender people_trans people_transgender',
'20_america_america really_america america_fuck america',
'21_got elected_voted_didn vote_won popular vote',
'22_vance_vance doesn_vance vance_jd vance',
'23_rapist president_trump rapist_rapist_voted rapist',
'24_obamacare_repeal aca_health insurance_affordable care act',
'25_national guard_national guards_military_guard',
'26_joe rogan_rogan_rogan just_joe rogan podcast',
'27_maga essentially_maga_maga like_maga maga',
'28_supreme court_supreme courts_justices_supreme court justices',
'29_matt gaetz_gaetz confirmed_like gaetz_gaetz',
'30_states blue states_states red states_red states_blue states',
'31_project 2025 isn_project 2025 said_project 2025_project 2025 just',
'32_going prison_going jail_incarcerated_jail',
'33_democracy_democracy dead_vote democracy_democracy democracy',
'34_education_schooling_public education_schools',
'35_gen xers_genz_gen gen_gen doesn',
'36_leopards_leopards ate face_faces eaten leopards_face eating leopards',
'37_canadians_canada_canadian_moving canada',
'38_iowa_wins iowa_won iowa_iowa poll',
'39_guns_buy gun_firearms_firearm',
'40_epstein_epstein trump_epsteins_trump epstein',
'41_voter fraud_election fraud_fraud election_rigged election',
'42_eggs_eggs yeah_yeah eggs_eggs just',
'43_puerto rico joke_called puerto rico_puerto ricans_puerto rican',
'44_headline_accurate headline_misleading headline_read headline',
'45_white person_white_white man_whites',
'46_recess appointments_make recess appointments_allow recess appointments_recess appointment',
'47_conservatives_conservatives like_liberals conservatives_conservative',
'48_texas_texas texas_tx_texas going',
'49_merrick garland_merrick garland ag_fuck garland_garland did',
'50_georgia_ga_state georgia_ga nc',
'51_betting markets_betting market_people betting_betting',
'52_reagan_reagan did_voted reagan_ronald reagan',
'53_reddit echo chamber_echo chamber reddit_echo chamber_echo chambers',
'54_latino_hispanic latino_latinos_latino people',
'55_twitter_twitter twitter_deleted twitter_twitter used',
'56_cooked_cooked just_cook_meals',
'57_tim walz_walz actually_walz_like walz',
'58_jill stein_jill stein just_voted jill stein_voting jill stein',
'59_fbi_fbi background_fbi background checks_trump fbi',
'60_tucker carlson_tucker really_tucker_like tucker',
'61_war hawks_trump said_guns trained_military',
'62_mom_mom dad_parent_mother',
'63_veterans trump_benefits veterans_veterans voted_veterans benefits',
'64_pardon trump_trump pardon_pardoned trump_president pardon',
'65_newsom_like newsom_think newsom_2028 newsom',
'66_porn_pornographic_pornography_porn just',
'67_garbage truck_truck garbage_trash truck_drive garbage truck',
'68_law yeah_law isn_law_law illegal',
'69_pump dump_pumping stock_dumping money_dump',
'70_states rights_state rights_states rights states_rights states',
'71_fluoride water_water fluoridation_fluoride drinking water_fluoride',
'72_crosses_cross_jerusalem_crusaders',
'73_idiocracy_living idiocracy_idiocracy america_movie idiocracy',
'74_prison trump_trump prison_trump jail_sentence trump',
'75_parents trump_mom voted_voted trump_maga family',
'76_gay_gay gay_people gay_gay people',
'77_brain worm_worm brain_brain worms_worm ate brain',
'78_liz cheney_like liz cheney_campaigning liz cheney_dick liz cheney',
'79_celebrity endorsement_celebrity endorsements_endorsements like_endorsement like',
'80_fema workers_fema just_fema_maga',
'81_climate change_climate change going_like climate change_climate change real',
'82_newt gingrich_gingrich_newt_paul ryan',
'83_dictator_dictator day_said dictator day_dictator life',
'84_blue blue wave_blue wave_wave blue_red wave blue',
'85_signs trump_signs trump signs_trump sign_trump signs',
'86_john bolton_john bolton dick_bolton dick cheney_bolton suggests',
'87_economist_economists_economically liberal_liberal',
'88_pence_trump pence_mike pence_pence right',
'89_getting rid filibuster_rid filibuster_eliminating filibuster_filibuster',
'90_blew mic_blowing mic_mic_blow mic',
'91_garbage_yeah garbage_trash garbage_let garbage',
'92_oligarchy_oligarchy just_american oligarchy_america oligarchy',
'93_magats really_magats_magats going_thing magats',
'94_clown_clowns_clowned_clowning',
'95_vote blue_voting blue_just vote blue_blue vote',
'96_illegal lottery_lottery illegal_running illegal lottery_lottery',
'97_businessman_richest man_richest_billionaire',
'98_tiktok_tiktoks_youtube tiktok_news tiktok']
30, 50
# Run the "30, 50" experiment: call topic_modelling with 30 and 50 plus the
# "KeyBERTInspired" option, and unpack the seven values it returns.
topics, probs, topic_representation, topics_map, topics_hierarchy, topics_over_time, plot = (
    topic_modelling(30, 50, "KeyBERTInspired")
)
# Re-apply the Plotly default renderer (same value as set at the top of the file).
pio.renderers.default = "notebook+vscode+jupyterlab"
plot
2024-12-16 00:33:35,749 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 00:33:53,034 - BERTopic - Dimensionality - Completed ✓
2024-12-16 00:33:53,038 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [00:33:42.970510] Transform can only be run with brute force. Using brute force.
2024-12-16 00:34:35,729 - BERTopic - Cluster - Completed ✓
2024-12-16 00:34:35,771 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 00:35:01,662 - BERTopic - Representation - Completed ✓
100%|██████████| 184/184 [00:26<00:00, 6.93it/s]
16it [01:45, 6.61s/it]
topics_map
topics_hierarchy
topic_representation["Name"].to_list()
['-1_biden_republicans_democrats_republican',
'0_bernie_bernie sanders_democrats_blame democrats',
'1_voted harris_votes harris_voting harris_vote harris',
'2_candidate_campaign_election_democrat',
'3_russia_russians_russian_like russia',
'4_israel palestine_gaza_palestinians_palestine',
'5_women_women women_women just_vote women',
'6_trump_trump trump_trump like_know trump',
'7_trump tariffs_inflation_tariffs work_tariffs',
'8_biden_biden biden_blame biden_joe biden',
'9_christian_christianity_christians_religion',
'10_kamala going_vote kamala_kamala wins_kamala win',
'11_vote_vote vote_vote voting_voting',
'12_rfk jr_rfk_kennedy jr_jr',
'13_fox news_fox_mainstream media_media outlets',
'14_abortion_abortion laws_abortion bans_abortion ban',
'15_won popular vote_popular vote_didn vote_did vote',
'16_thanks_thank_thanks sharing_appreciate',
'17_trans people_transgender_transgender people_transphobic',
'18_elon musk_musk_musk trump_like musk',
'19_illegal immigrants_immigrants_illegal immigrant_illegals',
'20_rapist_child rapist_convicted rapist_rape',
'21_vance_vance going_vance said_trump vance',
'22_america america_america_america fucked_americans',
'23_supreme court_supreme court justices_conservative supreme court_justices',
'24_joe rogan_rogan_joe_like joe',
'25_hitler_trump hitler_hitlers_hitler did',
'26_isn going_didn say_doesn_liar',
'27_matt gaetz_gaetz_gaetz ag_matt',
'28_fascist_fascism_fascists_fascism trump',
'29_elon_like elon_know elon_elon just',
'30_project 2025 going_project 2025 just_project 2025_read project 2025',
'31_iowa poll_iowa_polls_polling',
'32_national guard_military_guard_use military',
'33_voter fraud_election fraud_rigged election_election rigged',
'34_blue states_states blue_blue state_red states',
'35_leopards_leopards eating_leopards ate_leopards eat',
'36_democracy_democracy isn_save democracy_end democracy',
'37_guns_firearms_weapons_gun',
'38_maga_maga just_maga doesn_know maga',
'39_obamacare_affordable care act_health insurance_insurance',
'40_yes did_sure did_did yes_did did',
'41_genz_gen voters_millennials_gen',
'42_mail voting_mail ballot_mail ballots_vote mail',
'43_eggs_eggs just_egg_eggs expensive',
'44_education_department education_schooling_educational',
'45_epstein_trump epstein_jeffrey epstein_michael',
'46_puerto ricans_puerto rico_puerto rican_puerto',
'47_texas_tx_texas florida_state texas',
'48_conservatives_conservatives like_conservatives think_conservative',
'49_dogs cats_eating dogs_eating cats_cats dogs',
'50_people stupid_people fucking stupid_stupidity_stupid people',
'51_really fucked_fucked_fucked going_fucked just',
'52_headline_headline like_read headline_headlines like',
'53_white people_white person_white middle class_whites',
'54_canada_canadian_trudeau_america',
'55_going prison_prison_goes prison_stay prison',
'56_merrick garland_garland_merrick_congress',
'57_tim walz_walz_tim_like kamala',
'58_rich people_wealthy_rich_rich rich',
'59_georgia_ga_ga nc_atlanta',
'60_reagan_ronald reagan_regan_republican',
'61_twitter_twitter did_twitter account_twitter people',
'62_betting_bets_betting trump_bet',
'63_jill stein_candidate primary_candidate_party candidate',
'64_echo chambers_echo chamber_echo chamber reddit_reddit echo chamber',
'65_stock_stock market_investors_buy',
'66_mom_mom told_mother_mothers',
'67_fbi background_fbi_background checks_security clearance',
'68_latino voters_latino vote_latinos trump_latinos',
'69_shocked shocked_shocked_shocking_shocked vote',
'70_porn_ban porn_porn stars_porn star',
'71_guns trained_war hawks_guns_shooting',
'72_orange_orange face_orange man_orange man bad',
'73_tucker carlson_tucker_like tucker_carlson',
'74_taxes_tax_taxation_paying taxes',
'75_laws_laws people_law_follow laws',
'76_clown_clowns_clown just_fucking clown',
'77_gay_gays_gay men_normal gay',
'78_pardon_pardon trump_pardoning_pardons',
'79_joking_joke_joked_like joke',
'80_recess_appointments_appointments trump_appointment',
'81_parents voted_voted trump_voting trump_trump supporter',
'82_2016_year 2016_2016 2020_happened 2016',
'83_brain worm_brain worms_worm_worms',
'84_tulsi gabbard_gabbard_tulsi_russian asset',
'85_liz cheney_like liz cheney_campaigning liz cheney_cheney',
'86_blue wave_blue blue_blue_wave',
'87_states rights_state rights_rights state_state right',
'88_garbage truck_truck_garbage_trash',
'89_crosses_cross_crusade_christianity',
'90_4yrs_years left_years years_years going',
'91_fluoride_water_chemicals_drink',
'92_idiocracy_idiocy_stupid_idiots',
'93_economist_economists_economics_endorsed harris',
'94_celebrity endorsements_endorsements_endorsement_endorsed',
'95_fema_hurricane_hurricanes_storms',
'96_gingrich_newt_paul ryan_politics',
'97_veterans_voted trump_veteran_trump',
'98_garbage_garbage garbage_garbage like_called garbage',
'99_climate change_global warming_climate_warming',
'100_ai_ai generated_artificial_robot',
'101_leak_leaking_leaked_leaks',
'102_pence_mike pence_republican_republicans',
'103_newsom_gavin newsom_newsome_gavin newsome',
'104_housing_housing market_housing costs_affordable housing',
'105_let burn_burn_burned_burning',
'106_woke_anti woke_wokeness_wake',
'107_win_wins_win win_winning',
'108_vote blue_voting blue_voted blue_vote',
'109_filibuster_house senate_senate_senate house',
'110_john bolton_bolton_worse trump_trump bad',
'111_signs trump_trump sign_harris signs_trump signs',
'112_oligarchy_oligarchs_oligarch_billionaire oligarchs',
'113_stephen miller_steven miller_miller_stephen',
'114_dementia_dementia don_demented_cognitive decline',
'115_police_cops_police officers_policing',
'116_richest man_richest man world_richest person_richest',
'117_mic_blowing microphone_mic stand_microphone',
'118_voting age_younger voters_young voters_youth vote',
'119_robinson_mark_win_didn win',
'120_bezos_jeff bezos_amazon_billionaire',
'121_civil war_american civil_confederate_war',
'122_sad_just sad_really sad_actually sad',
'123_fingers crossed_sure hope_hope right_soon',
'124_tiktok_youtube tiktok_tik tok_tik',
'125_dictator day_dictator_going dictator_dictators',
'126_cheated wife_wives_wife_married',
'127_social security_medicare social security_social security medicare_medicare social',
'128_spain_immigration_visas_immigrate',
'129_combat_military_active duty_duty',
'130_taliban_afghanistan_american_maga',
'131_magats_magat_fuckin_says',
'132_free fair election_fair election_fair elections_election fair',
'133_convicted felon_convicted felons_felon president_felon',
'134_jordan_jordan peterson_gym_jim',
'135_pelosi_nancy pelosi_democrats_bernie',
'136_working class_working class people_working class americans_working class american',
'137_owning libs_libs_libs just_lib',
'138_podcasters_podcaster_podcast_podcasts',
'139_late_far late_way late_little late',
'140_cheating_cheated_cheat_cheat steal',
'141_monarchy_prince_royal_throne',
'142_vote day_voting day_election day_day election day',
'143_traitors_traitor_treason_treasonous',
'144_fuck em_fuckers_fuck people_fuck fuck',
'145_economy democrats_republicans better_republicans_better economy',
'146_herschel walker_missile defense_missile_missiles',
'147_golf_golf course_playing golf_play golf',
'148_impeached_twice impeached_impeachment_impeach',
'149_trump pays_money trump_billionaire trump_trump bought',
'150_collins_susan_shocked_worst person',
'151_video_videos_video clip_watching video',
'152_recount_recounts_votes_ballots',
'153_lindsey graham_graham_lindsey_donald trump going',
'154_lottery_fraud_rigged_scam',
'155_reddit_reddit reddit_reddit just_reddits',
'156_unintelligent_disgusting person_behavior_humiliation',
'157_lawsuits_lawsuit_law school_law',
'158_brexit_britain_uk_referendum',
'159_free speech_freedom speech_speech_freedom',
'160_won isn_won_winning_like won',
'161_china taiwan_china_taiwan_chinese',
'162_alex jones_jones_alex_infowars',
'163_slavery_slaves_slave labor_slave',
'164_bots_bot_robots_trolls',
'165_going lose_gonna lose_losing_loses',
'166_black voters_black vote_trump black_trump racist',
'167_onion_satire_shit says_hilarious',
'168_drain swamp_swamp_draining_drained',
'169_example trump_trump_criminal_trump did',
'170_john oliver_oliver_john_says',
'171_couch_couches_sit couch_furniture',
'172_mitch mcconnell_mcconnell_mitch_senator',
'173_gerrymandering_redistricting_electoral college_electoral',
'174_lie_lying_lied_lies',
'175_dc_washington dc_washington_voted democratic',
'176_citizens united_citizen united_citizens_citizen',
'177_mooch_dog_instead_continue',
'178_worked trump_despises trump_trump fully_terms trump',
'179_bought twitter_buy twitter_twitter did_twitter wasn',
'180_copium_hopium_smoking_share',
'181_hulk hogan_hogan_hulk_wwe',
'182____',
'183_electoral college_electoral_electors_voting',
'184_projection_projecting_projected_inception']
30, 75
# Run the "30, 75" experiment: call topic_modelling with 30 and 75 plus the
# "KeyBERTInspired" option, and unpack the seven values it returns.
topics, probs, topic_representation, topics_map, topics_hierarchy, topics_over_time, plot = (
    topic_modelling(30, 75, "KeyBERTInspired")
)
# Re-apply the Plotly default renderer (same value as set at the top of the file).
pio.renderers.default = "notebook+vscode+jupyterlab"
plot
2024-12-16 05:03:21,353 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:03:38,813 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:03:38,815 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:03:27.917795] Transform can only be run with brute force. Using brute force.
2024-12-16 05:04:27,475 - BERTopic - Cluster - Completed ✓
2024-12-16 05:04:27,504 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:04:49,547 - BERTopic - Representation - Completed ✓
100%|██████████| 119/119 [00:22<00:00, 5.26it/s]
16it [01:29, 5.59s/it]
topics_map
topics_hierarchy
topic_representation["Name"].to_list()
['-1_democrats_candidate_voters_trump',
'0_lol_haha_good_thank',
'1_tariffs_inflation_tariff_gas prices',
'2_bernie_democrats_democrat_bernie sanders',
'3_musk_elon musk_like musk_trump musk',
'4_trump_trump trump_trump going_trump just',
'5_vote harris_voting harris_voted harris_harris campaign',
'6_hillary_shes_woman_trump',
'7_putin_russia_trump putin_putins',
'8_gaza_palestinians_palestine_hamas',
'9_vote_vote vote_voting_vote vote vote',
'10_christians_christian_christianity_christian values',
'11_rfk_rfk jr_kennedy_jfk',
'12_joe biden_biden_biden biden_biden called',
'13_vote kamala_voting kamala_voted kamala_kamala campaign',
'14_fox news_fox_mainstream media_msnbc',
'15_going lose_lose_loses_losing',
'16_illegal immigrants_illegal immigration_illegals_deportations',
'17_won popular vote_did vote_win popular vote_didn vote',
'18_abortion_ban abortion_abortion bans_abortion ban',
'19_trans people_transgender people_trans woman_trans rights',
'20_america_america america_americans_american',
'21_rapist_voted rapist_trump rapist_pedophile rapist',
'22_vance_vance vance_like vance_vance going',
'23_obamacare_affordable care act_repeal aca_health insurance',
'24_joe rogan_like joe rogan_rogan_rogan just',
'25_nazi_nazis_hitler_trump hitler',
'26_supreme court_justices_supreme court trump_supreme court justices',
'27_fascist_fascism_fascists_fascist government',
'28_matt gaetz_gaetz_gaetz just_gaetz ag',
'29_women voted trump_women voted_women vote_women voting',
'30_project 2025 going_project 2025 just_project 2025_read project 2025',
'31_military_use military_military leadership_people military',
'32_genz_gen voters_gen_millennials',
'33_maga_maga maga_maga just_maga like',
'34_leopards eating faces_leopards ate face_leopards_leopard eating face',
'35_democracy_vote democracy_democracy democracy_democracy dead',
'36_women_women hate_hates women_women just',
'37_voter fraud_election fraud_fraud election_rigged election',
'38_education_department education_dept education_schooling',
'39_iowa poll_iowa_selzer poll_polls',
'40_mail voting_voting mail_early voting_vote early',
'41_epstein_epsteins_like epstein_release epstein',
'42_guns_firearms_gun ownership_gun',
'43_states red states_red states_blue states_states red',
'44_going prison_goes prison_going jail_prison',
'45_eggs_egg_eggs just_eggs won',
'46_canada_canadian_moving canada_usa',
'47_puerto rico comments_puerto ricans_puerto rican voters_puerto rican',
'48_white people_white middle class_white males_white man',
'49_twitter_use twitter_twitter facebook_twitter account',
'50_article isn_stupid article_article just_headline says',
'51_conservatives_conservative_conservatives just_liberals conservatives',
'52_reagan_reagan trump_ronald reagan_like reagan',
'53_merrick garland_biden garland_garland_merrick',
'54_betting markets_betting_bets_betting sites',
'55_latinos voted_latino vote_latino voters_latinos',
'56_texas_texas texas_tx_lose texas',
'57_jill stein_vote stein_voted stein_stein voters',
'58_tim walz_walz actually_walz didn_walz',
'59_echo chamber_echo chambers_echo chamber reddit_reddit echo chamber',
'60_ga nc_georgia_nc_ga',
'61_war hawks_guns trained_shooting_guns',
'62_liz cheney_liz cheney like_like liz cheney_dick liz cheney',
'63_orange vest_orange face_orange man_orange',
'64_tucker carlson_like tucker_tucker_carlson',
'65_kids_children_children don_like kids',
'66_porn_pornographic_pornography_porn site',
'67_fbi background checks_fbi background check_fbi background_fbi',
'68_peppers_sauce_cooking_spicy',
'69_law isn_law yeah_laws_law doesn',
'70_mitch mcconnell_mcconnell_republican senator_senate does',
'71_states rights_state rights_rights state_state right',
'72_pardon trump_pardoning_pardon_going pardon',
'73_recess appointments_recess appointments senate_recess_appointments trump',
'74_newsom_gavin newsom_newsome_probably',
'75_parents trump_voted trump_maga family_trump',
'76_crosses_cross_jerusalem_crusade',
'77_fluoride water_fluoride drinking water_fluoride_fluoride drinking',
'78_veterans trump_vets voted_veterans_veteran',
'79_tulsi gabbard_gabbard russian_tulsi_gabbard said',
'80_normal gay_gay_normal gay guy_normal gays',
'81_brain worm_brain worms_brainworm_brainworms',
'82_celebrity endorsements_endorsements_endorsement_endorsed',
'83_idiocracy_idiots_stupid_idiot truly',
'84_garbage truck_drive garbage truck_trash truck_driving garbage truck',
'85_climate change_climate change going_fight climate change_fight climate',
'86_newt gingrich_gingrich_newt_paul ryan',
'87_blue wave_red wave_blue tsunami_wave',
'88_clown_clown actually_fucking clown_clowns',
'89_economist_economists_economics_better economy',
'90_john bolton_bolton_patriot_worse trump',
'91_garbage_garbage oh_trash garbage_say garbage',
'92_filibuster_filibustered_rid filibuster_filibuster gone',
'93_dictator_dictator day_dictator life_dictators',
'94_free fair election_free fair elections_fair election_free election',
'95_pence_mike pence_trump pence_hang mike pence',
'96_oligarchy_america oligarchy_global oligarchy_oligarch',
'97_vote blue_voting blue_blue vote_blue voting',
'98_lottery illegal_illegal lottery_lottery_lotteries',
'99_mark robinson_robinson_mark_win',
'100_trump sign_signs trump_harris signs_trump signs',
'101_woke_woke stuff_anti woke_wokeism',
'102_fraud_fraud fraud_fraud say_fraud right',
'103_stephen miller_stephen miller trump_steven miller_miller',
'104_tiktoks_tiktok_tik tok_tik',
'105_jim jordan_jordan_jim_gym',
'106_microphone blow job_giving microphone_microphone blow_microphone',
'107_won likely_won_won surprised_win isn',
'108_ai_ai stuff_ai just_thing ai',
'109_civil war_start civil war_american civil war_confederate',
'110_let burn_burn_burning_burned',
'111_bots_bot_bots trolls_trolls',
'112_cops_police_police officers_policing',
'113_voting age_young people vote_young voters_younger voters',
'114_reddit_reddit does_reddit reddit_reddit just',
'115_got dementia_dementia_dementia riddled_alzheimer',
'116_magats_magats going_magat_magats don',
'117_second wife_cheating wife_wife_cheated wife',
'118_prison trump_jail trump_trump prison_trump jail',
'119_owning libs_owned libs_libs_just libs']
30, 100
# Run the "30, 100" experiment named in the section heading and unpack the
# seven values returned by topic_modelling.
#
# Fix: the call previously passed (30, 50), duplicating the earlier "30, 50"
# experiment instead of running the configuration this section is about.
# NOTE(review): the output recorded below this cell (182 topics) appears to
# come from the old (30, 50) call -- re-run the cell to refresh it.
(
    topics,
    probs,
    topic_representation,
    topics_map,
    topics_hierarchy,
    topics_over_time,
    plot,
) = topic_modelling(30, 100, "KeyBERTInspired")
# Re-apply the Plotly default renderer (same value as set at the top of the file).
pio.renderers.default = "notebook+vscode+jupyterlab"
plot
2024-12-16 05:17:50,509 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:18:07,920 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:18:07,923 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:17:58.095146] Transform can only be run with brute force. Using brute force.
2024-12-16 05:18:51,560 - BERTopic - Cluster - Completed ✓
2024-12-16 05:18:51,592 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:19:14,737 - BERTopic - Representation - Completed ✓
100%|██████████| 182/182 [00:27<00:00, 6.71it/s]
16it [01:39, 6.19s/it]
topics_map
topics_hierarchy
topic_representation["Name"].to_list()
['-1_democrats_trump_maga_republicans',
'0_bernie_bernie sanders_democrats_blame democrats',
'1_harris trump_trump harris_voted harris_vote harris',
'2_hillary_candidate_election_votes',
'3_gaza_support israel_israel palestine_palestinians',
'4_russia_russians_putin_like russia',
'5_biden_blame biden_biden fault_biden biden',
'6_christians_christian_christianity_religious',
'7_rfk jr_rfk_jr_kennedy',
'8_women_women women_ladies_women just',
'9_inflation_inflationary_raise prices_fed',
'10_national guard_military_use military_military coup',
'11_vote vote_vote_vote vote vote_voting',
'12_tells_tells like_says_said',
'13_kamala going_kamala_vote kamala_kamala campaign',
'14_fox news_fox_mainstream media_media outlets',
'15_trump trump_trump_trump president_trump isn',
'16_won popular vote_winning popular vote_vote won_win popular vote',
'17_ban abortion_abortion bans_abortion_abortion ban',
'18_musk_elon musk_musk trump_trump musk',
'19_trans people_transgender people_trans rights_transgender',
'20_illegal immigrants_illegal immigrant_illegal immigration_illegals',
'21_vance_vance said_trump vance_jd vance',
'22_texas_tx_states blue_texan',
'23_rapist_voted rapist_child rapist_rapists',
'24_america_america america_america fuck_america really',
'25_thank_thanks_thank sharing_welcome',
'26_hitler_trump hitler_nazi_nazis',
'27_fascist_fascism_fascists_fascism trump',
'28_supreme court_supreme court justices_justices_supreme court ruling',
'29_joe rogan_rogan_like rogan_joe',
'30_matt gaetz_gaetz_matt_gabbard',
'31_elon_like elon_elon said_elon just',
'32_voter fraud_election fraud_rigged election_election rigged',
'33_project 2025 going_project 2025_project 2025 just_read project 2025',
'34_maga idiots_maga morons_maga garbage_fuck maga',
'35_genz_gen_millennials_generation',
'36_democracy isn_democracy_isn democracy_vote democracy',
'37_leopards ate face_leopards_leopardsatemyface_leopards ate',
'38_education_public education_schooling_educate',
'39_obamacare_affordable care act_health insurance_healthcare plan',
'40_iowa poll_iowa_polls_polling',
'41_yes did_did yes_sure did_actually did',
'42_guns_firearms_armed_gun',
'43_epstein_jeffrey epstein_michael_hollywood',
'44_eggs_egg_price eggs_chickens',
'45_going prison_incarcerated_prison_prison sentence',
'46_people stupid_stupidity people_stupid people_stupidity',
'47_puerto rico_puerto rican_puerto ricans_puerto',
'48_headline_headline like_like headline_read headline',
'49_early voting_mail voting_voting early_vote early',
'50_sad_really sad_just sad_fucking sad',
'51_twitter_twitter account_like twitter_tweet',
'52_white people_people white_white person_whites',
'53_conservatives_conservative_conservatives liberals_liberals conservatives',
'54_merrick garland_garland_merrick_partisan',
'55_echo chamber_echo chambers_reddit echo chamber_reddit echo',
'56_nc ga_ga nc_georgia_nc',
'57_reagan_reagan trump_ronald reagan_reagan won',
'58_canada_canadian_usa_america',
'59_betting_bets_place bets_betting trump',
'60_orange_orange man_orange fuck_orange face',
'61_tim walz_walz_tim_didn',
'62_jill stein_candidate_party candidate_candidates',
'63_billionaires_billionaires don_billionaire_billionaire class',
'64_latino voters_latinos_latino_hispanics',
'65_war hawks_guns_shooting_troops',
'66_cooked_cook_food_meat',
'67_2016_happened 2016_2016 2020_2020 2016',
'68_mom_mother_mum_parent',
'69_shocked_shocked shocked_shocking_shocked just',
'70_states rights_state rights_rights state_state right',
'71_liz cheney_cheney_cheneys_dick cheney',
'72_tucker carlson_tucker_like tucker_carlson',
'73_fbi_background checks_background check_security clearance',
'74_joking_joke_just joke_joke did',
'75_porn_pornography_ban porn_porn star',
'76_newsom_gavin newsom_newsome_whitmer',
'77_law isn_laws people_law yeah_laws',
'78_stock_stock market_sells_shareholders',
'79_pardon trump_pardon_pardoning_pardons',
'80_voting trump_voted trump_vote trump_voted trump 2016',
'81_taxes_tax_pay taxes_paying taxes',
'82_crosses_cross_crusade_crusades',
'83_garbage truck_truck_trucks_trash',
'84_gay_gay guy_gay man_gay men',
'85_tulsi gabbard_tulsi_gabbard_russian asset',
'86_idiocracy_idiocy_idiots_stupid',
'87_blue wave_wave_waves_blue',
'88_years years_years lol_years left_years',
'89_fluoride_water_drink_milk',
'90_brain worm_brain worms_worm_worms',
'91_recess_appointments_appointment_appoint',
'92_celebrity endorsements_endorsements_endorsement_celebrities',
'93_future elections_election_election election_fair election',
'94_climate change_global warming_climate_climate action',
'95_clown_clowns_fucking clown_clown car',
'96_economist_economists_economics_endorsed',
'97_newt_moral_paul ryan_ryan',
'98_affordable housing_housing crisis_housing_housing market',
'99_let burn_burn_burning_burned',
'100_fema_hurricane_hurricanes_disasters',
'101_garbage_garbage garbage_called garbage_calling garbage',
'102_blame_continue blame_say blame_don blame',
'103_john bolton_bolton_trump bad_patriot',
'104_pence_mike pence_republican_republicans',
'105_reddit_reddit reddit_reddit just_reddit like',
'106_tiktok_tik tok_tik_tweets',
'107_drain swamp_swamp_drained_clearly trump',
'108_woke_anti woke_wokeness_wake',
'109_lottery_fraud_fraudulently_fraudulent',
'110_vote blue_voting blue_blue vote_voted blue',
'111_mic_mic stand_microphones_microphone',
'112_cops_police_policing_cop',
'113_declaring victory_declare victory_win isn_won isn',
'114_oligarchy_oligarch_oligarchs_billionaire oligarchs',
'115_stephen miller_miller_stephen_trump soon',
'116_dementia_alzheimer_dementia don_demented',
'117_civil war_american civil_confederate_confederacy',
'118_signs trump_trump sign_trump signs_trump ones',
'119_businessman_richest man_doing business_business',
'120_filibuster_senate_house senate_senators',
'121_owning libs_owned libs_libs_libs just',
'122_fraud_fraudulent_fraudster_fraud claims',
'123_election day_vote day_voting day_day election',
'124_mark robinson_robinson_couldn vote_picked trump',
'125_trump won_wins trump_trump wins_trump win',
'126_jim jordan_jordan_jim_gym',
'127_twice impeached_impeached_impeachment_impeach',
'128_ai_artificial_bot_intelligent',
'129_fingers crossed_hoping_looking forward_soon',
'130_magats_magat_fuckin_frat',
'131_convicted felon_felon_felons_felonies',
'132_bots_bot_trolls_ai',
'133_taliban_afghanistan_american_fought',
'134_leak_leaking_leaked_leaks',
'135_crime trump_trump prison_trump_trump committed',
'136_young voters_younger voters_voting age_youth vote',
'137_late_better late_far late_little late',
'138_wife_wives_married_ex wife',
'139_percent_percentages_15_rate',
'140_bezos_jeff bezos_amazon_billionaires',
'141_pelosi_nancy pelosi_feel democrats_bernie',
'142_concepts plan_concept plan_plan plan_concepts',
'143_missile defense_missile_missiles_walker',
'144_collins_susan_shocked_worst person',
'145_newsweek_newsweek article_journalism_bothered read',
'146_lindsey graham_graham_lindsey_lindsay',
'147_social security_medicare social security_social security medicare_medicare social',
'148_palpatine_darth_vader_star wars',
'149_monarchy_aristocracy_orders_order',
'150_fuck em_fuck people_fuck fuck_fuck',
'151_frivolous lawsuits_lawsuits_lawsuit_suing',
'152_working class_working class american_help working class_class working class',
'153_onion_actual news_satire_shit says',
'154_mitch mcconnell_mcconnell_mitch_senator',
'155_alex jones_infowars_jones_alex',
'156_electoral college_electoral_electors_elections',
'157_china_taiwan_chinese_world',
'158_isn going win_win pa_wins_kamala win',
'159_recount_recounts_ballots_votes',
'160_leon_involved_paid_corpse',
'161_podcasters_podcaster_podcast_podcasts',
'162_justice_injustices_injustice_court justice',
'163_john oliver_oliver_john_says',
'164_cheating_cheated_cheat_cheats',
'165_couch_couches_furniture_like jd',
'166_video clip_clip_clips_watched clip',
'167_rules thee_rule_rules just_make rules',
'168_domestic terrorists_domestic terrorism_domestic terrorist_terrorists',
'169_great idea_idea__',
'170_drunk_drinking_sober_alcohol',
'171_bought twitter_twitter got_twitter_twitter don',
'172_golf_golf course_play golf_golfing',
'173_immigration_passport_visas_visa',
'174_elmo_furry_twitter_created',
'175_citizens united_citizen united_citizens_citizen',
'176_economy trump_economy bad_republican voters_democrats better',
'177_political party_parties_party reason_major parties',
'178_camera_cameras_man_bad man',
'179_dc_democrat_voted democrat_maryland',
'180_worked trump_despises trump_trump fully_terms trump',
'181_brexit_britain_uk_eu',
'182_copium_hopium_smoking_quite bit']